import pandas as pd
import numpy as np
# Load the Kaggle Titanic training set from its path relative to the
# working directory.
data = pd.read_csv('kaggle_titanic_data/train.csv')

# Keep only the label ('Survived') and the columns used downstream.
# .copy() makes the selection an independent frame, so the column
# assignment below writes to `data` itself and cannot trigger pandas'
# SettingWithCopyWarning.
data = data[['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']].copy()
print(data)

# Fill missing ages with the mean age. The mean must be taken from the
# 'Age' column; the previous lookup of data['data'] named a column that
# is not in the selection above and raised KeyError.
data['Age'] = data['Age'].fillna(data['Age'].mean())



"""
data['Cabin'] = pd.factorize(data.Cabin)[0]
data.fillna(0, inplace=True)


data['Sex'] = [1 if x == 'male' else 0 for x in data.Sex]
data['p1'] = np.array(data['Pclass'] == 1).astype(np.int32)
data['p2'] = np.array(data['Pclass'] == 2).astype(np.int32)
data['p3'] = np.array(data['Pclass'] == 3).astype(np.int32)
del data['Pclass']

data['e1'] = np.array(data['Embarked'] == 'S').astype(np.int32)
data['e2'] = np.array(data['Embarked'] == 'C').astype(np.int32)
data['e3'] = np.array(data['Embarked'] == 'Q').astype(np.int32)
del data['Embarked']

data_train = data[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'p1', 'p2', 'p3', 'e1', 'e2', 'e3']]
data_target = data['Survived'].values.reshape(len(data), 1)
print(np.shape(data_train))
"""